First, we must load our data. The path below is local to my machine; from the GitHub repository, the equivalent path would be "./data/census_2011_UK_OA.RData".
# Load the 2011 UK census counts at Output Area (OA) level.
# This populates objects used below (e.g. Census_2011_Count_All, Liverpool,
# OAC_Input_Lookup). NOTE(review): absolute local path — on the repository
# this would be "./data/census_2011_UK_OA.RData".
load("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/census_2011_UK_OA.RData")
Next, we must subset our data to pull census results for only Liverpool.
# Left join: keep every Liverpool OA and attach its national census counts
Census_2011_Count <- merge(x = Liverpool, y = Census_2011_Count_All, by = "OA", all.x = TRUE)
# Preview the variable lookup table. The empty index "[,]" in the original
# was redundant — for a data frame it returns the whole object unchanged.
head(OAC_Input_Lookup)
## VariableCode Type Denominator SubDomain Domain VariableDescription
## 1 k001 Count KS102EW0001 Population Age Demographic Age 0 to 4
## 2 k002 Count KS102EW0001 Population Age Demographic Age 5 to 14
## 3 k003 Count KS102EW0001 Population Age Demographic Age 25 to 44
## 4 k004 Count KS102EW0001 Population Age Demographic Age 45 to 64
## 5 k005 Count KS102EW0001 Population Age Demographic Age 65 to 89
## 6 k006 Count KS102EW0001 Population Age Demographic Age 90 and over
## England_Wales
## 1 KS102EW0002
## 2 KS102EW0003,KS102EW0004,KS102EW0005
## 3 KS102EW0010,KS102EW0011
## 4 KS102EW0012,KS102EW0013
## 5 KS102EW0014,KS102EW0015,KS102EW0016
## 6 KS102EW0017
The next phase of the analysis is to aggregate all variables for OAC subgroups into their respective supergroups. Looking at the data frame OAC_Input_Lookup above, we can see that each OAC variable code maps to one or more census variables (listed under the England_Wales column). We must write code to aggregate these.
# First we pull a column of all OA codes to append the aggregates onto
OAC_Input <- data.frame(OA = Census_2011_Count$OA)
# Aggregate the raw census counts into one column per OAC input variable.
# Each lookup row lists the England_Wales census codes to sum for that variable.
# Improvements over the original: seq_len() is safe for an empty lookup table,
# and collecting results in a pre-allocated list then binding once avoids the
# O(n^2) cost of cbind() inside the loop.
agg_cols <- vector("list", nrow(OAC_Input_Lookup))
for (n in seq_len(nrow(OAC_Input_Lookup))) {
  # Comma-separated England_Wales codes that make up variable n
  select_vars <- unlist(strsplit(as.character(OAC_Input_Lookup[n, "England_Wales"]), ","))
  # Row-wise sum of the component census counts (drop = FALSE keeps a
  # single selected column as a data frame so rowSums still works)
  agg_cols[[n]] <- rowSums(Census_2011_Count[, select_vars, drop = FALSE])
}
# Name each aggregate after its OAC variable code, then append all at once
names(agg_cols) <- as.character(OAC_Input_Lookup$VariableCode)
OAC_Input <- cbind(OAC_Input, as.data.frame(agg_cols))
# Remove temporary objects
remove(list = c("agg_cols", "select_vars", "n"))
Now that we have the aggregate variables in OAC_Input, we must first remove the k035 variable from OAC_Input; it will be calculated separately later:
# Drop the k035 (SIR) column for now; it is recomputed later from the
# illness counts rather than expressed as a simple percentage like the others.
OAC_Input$k035 <- NULL
Next, we must create our denominators from the same census file.
# Build the denominator selection. NOTE: the original pre-built an OA-only
# data frame here that was immediately overwritten by the column selection
# below, so that dead initialization has been removed.
# Create a list of unique denominator codes, dropping blank entries
den_list <- unique(as.character(OAC_Input_Lookup[, "Denominator"]))
den_list <- den_list[den_list != ""]
# Select the OA key plus all denominator columns from the census counts
OAC_Input_den <- Census_2011_Count[, c("OA", den_list)]
We then merge these denominators with the OAC_Input data frame containing our numerators, using our OA value as a key.
# Join numerator and denominator columns on the shared OA key
OAC_Input <- merge(x = OAC_Input, y = OAC_Input_den, by = "OA")
Now that we have both numerators and denominators, we create a data frame with both isolated.
To do this, we perform a similar task to above and loop through all variables to create an initial list called K_Var:
# Numerator/denominator pairs for all "Count" type variables.
# Select columns by name rather than the magic indices c(1, 3) — per the
# table header these are VariableCode (col 1) and Denominator (col 3).
K_Var <- OAC_Input_Lookup[OAC_Input_Lookup$Type == "Count", c("VariableCode", "Denominator")]
Then we perform the looping to match our percentages
# Data frame of OA codes to hold the percentage variables
OAC_Input_PCT_RATIO <- subset(OAC_Input, select = "OA")
# Convert every count to a percentage of its denominator.
# Improvements over the original: seq_len() is safe for an empty K_Var,
# and results are collected in a pre-allocated list and bound once instead
# of growing the data frame with cbind() every iteration (O(n^2)).
pct_cols <- vector("list", nrow(K_Var))
for (n in seq_len(nrow(K_Var))) {
  num <- as.character(K_Var[n, "VariableCode"]) # numerator column code
  den <- as.character(K_Var[n, "Denominator"])  # denominator column code
  # Percentage of the denominator for every OA
  pct_cols[[n]] <- OAC_Input[, num] / OAC_Input[, den] * 100
}
# Name each percentage after its numerator code and append in one step
names(pct_cols) <- as.character(K_Var$VariableCode)
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO, as.data.frame(pct_cols))
# Remove temporary objects
remove(list = c("pct_cols", "num", "den", "n"))
This will give us a list of percentages for every variable within Liverpool. We can use this to perform our spatial analysis.
First, however, we need to add population density from the original census data:
# Extract population density (KS101EW0008) from the original census data,
# relabelled as OAC variable k007
tmp <- data.frame(OA = Census_2011_Count$OA, k007 = Census_2011_Count$KS101EW0008)
# Merge onto the percentages data frame by the OA key
OAC_Input_PCT_RATIO <- merge(OAC_Input_PCT_RATIO, tmp, by = "OA")
Finally, we must add in the SIR (standardised illness ratio) that we removed earlier.
# Standardised illness ratio (k035): observed ill people as a percentage of
# the expected count under age-band-specific illness rates.
# Observed ill counts by age band
ill_16_64 <- rowSums(Census_2011_Count[,c("KS301EW0005","KS301EW0006")]) # Ill people 16-64
ill_total <- rowSums(Census_2011_Count[,c("KS301EW0002","KS301EW0003")]) # All ill people
ill_L15_G65 <- ill_total - ill_16_64 # Ill people 15 or less and greater than or equal to 65
# Population at risk in each age band
t_pop_16_64 <- rowSums(Census_2011_Count[,c("KS102EW0007","KS102EW0008","KS102EW0009","KS102EW0010","KS102EW0011","KS102EW0012","KS102EW0013")]) # People 16-64
t_pop <- Census_2011_Count$KS101EW0001 # All people
t_pop_L15_G65 <- t_pop - t_pop_16_64 # All people 15 or less and greater than or equal to 65
# Expected ill counts: band population times the overall band illness rate
ex_ill_16_64 <- t_pop_16_64 * (sum(ill_16_64)/sum(t_pop_16_64)) # Expected ill 16-64
ex_ill_L15_G65 <- t_pop_L15_G65 * (sum(ill_L15_G65)/sum(t_pop_L15_G65)) # Expected ill people 15 or less and greater than or equal to 65
ex_ill <- ex_ill_16_64 + ex_ill_L15_G65 # total expected ill people
# Ratio of observed to expected, scaled to 100
SIR <- data.frame(OA = Census_2011_Count$OA, k035 = ill_total / ex_ill * 100)
# Merge on the OA key instead of cbind(): the original relied on
# OAC_Input_PCT_RATIO and Census_2011_Count sharing row order, which a keyed
# merge makes explicit and robust (consistent with the density merge above).
OAC_Input_PCT_RATIO <- merge(OAC_Input_PCT_RATIO, SIR, by = "OA")
# Remove unwanted objects
remove(list=c("SIR","ill_16_64","ill_total","ill_L15_G65","t_pop_16_64","t_pop","t_pop_L15_G65","ex_ill_16_64","ex_ill_L15_G65","ex_ill"))
Next, we must normalize and standardize all our proportions added into OAC_Input_PCT_RATIO. We use the inverse hyperbolic sine and then a range standardization.
# Inverse hyperbolic sine transform of every variable (all columns except
# the OA key). Base asinh() is mathematically identical to
# log(x + sqrt(x^2 + 1)) and numerically more stable; selecting columns by
# name replaces the fragile hard-coded 2:61 index range.
OAC_Input_PCT_RATIO_IHS <- asinh(OAC_Input_PCT_RATIO[, setdiff(names(OAC_Input_PCT_RATIO), "OA")])
# Range Standardization: rescale a numeric vector onto [0, 1].
# x     — numeric vector.
# na.rm — ignore NAs when computing the range (default FALSE preserves the
#         original behaviour of propagating NA through min/max).
# Returns a vector the same length as x; NaN if max(x) == min(x).
range_01 <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Range-standardise every transformed column; apply() over columns returns
# a numeric matrix, which is the input form kmeans() expects downstream
OAC_Input_PCT_RATIO_IHS_01 <- apply(X = OAC_Input_PCT_RATIO_IHS, MARGIN = 2, FUN = range_01)
# Carry the OA codes over as row names so cluster results can be matched back
rownames(OAC_Input_PCT_RATIO_IHS_01) <- OAC_Input_PCT_RATIO$OA
In total, this gives us the normalized and standardized variables for each of the social indicator variables included in the original 2011 OAC study for Liverpool. The next step of this analysis requires us to find the ideal number of clusters for our aggregation level.
We can do this by using a total within sum of squares calculation. This is essentially a “mean standardized distance of the data observations to a cluster mean”
library(ggplot2)
# Total within-cluster sum of squares for k = 2..12 clusters.
# Improvements over the original: vapply() builds the vector directly instead
# of assigning into positions 2..12 of an empty numeric (which left an NA at
# index 1 that then had to be dropped), and kmeans()$tot.withinss is used
# directly — it equals sum($withinss).
wss <- data.frame(
  k = 2:12,
  Twss = vapply(2:12, function(k) {
    kmeans(OAC_Input_PCT_RATIO_IHS_01, centers = k, nstart = 20)$tot.withinss
  }, numeric(1))
)
# Plot the elbow curve
ggplot(data = wss, aes(x = k, y = Twss)) +
  geom_path() +
  geom_point() +
  scale_x_continuous(breaks = 2:12) +
  labs(y = "Total within sum of squares")
This plot shows how the total within sum of squares decreases as more clusters are used. The smallest drop appears between 7 and 8 clusters, and 7, according to the original lab doc, was the number selected for the initial OAC analysis in 2011.
Now that we have a value of 7, we can run our K means using our 7 cluster value.
# K-means with 7 clusters; large nstart/iter.max give a stable solution
cluster_7 <- kmeans(
  x = OAC_Input_PCT_RATIO_IHS_01,
  centers = 7,
  iter.max = 1000000,
  nstart = 10000
)
This is also included in the package for this lab, loaded from “./data/cluster_7.Rdata”:
# Load the precomputed k-means result supplied with the lab (overwrites
# cluster_7 with the reference output). NOTE(review): absolute local path —
# on the repository this would be "./data/cluster_7.Rdata".
load("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/cluster_7.Rdata")
This cluster_7 value possesses the preloaded output that would come from a K-means cluster analysis. We can view these using:
# Inspect the structure of the kmeans result object (cluster assignments,
# centres, sums of squares, cluster sizes)
str(cluster_7)
## List of 9
## $ cluster : Named int [1:1584] 7 5 7 5 5 7 5 1 1 4 ...
## ..- attr(*, "names")= chr [1:1584] "E00032987" "E00032988" "E00032989" "E00032990" ...
## $ centers : num [1:7, 1:60] 0.553 0.584 0.677 0.666 0.391 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7] "1" "2" "3" "4" ...
## .. ..$ : chr [1:60] "k001" "k002" "k003" "k004" ...
## $ totss : num 2827
## $ withinss : num [1:7] 286 308 250 255 159 ...
## $ tot.withinss: num 1635
## $ betweenss : num 1192
## $ size : int [1:7] 259 340 279 334 109 73 190
## $ iter : int 6
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
From here, we can aggregate the results of this cluster analysis into a lookup table for easier mapping:
# Lookup table: one row per OA with its numeric cluster (K_7), the OA code,
# and a letter recoding of the cluster (SUPER) for mapping
lookup <- data.frame(K_7 = cluster_7$cluster)
# The cluster vector is named by OA code, so the codes become row names
lookup$OA <- rownames(lookup)
# Recode clusters 1..7 as letters A..G
lookup$SUPER <- LETTERS[lookup$K_7]
We can look at the distribution of areas in each of our 7 clusters:
# Number of OAs assigned to each of the 7 clusters
table(lookup$K_7)
##
## 1 2 3 4 5 6 7
## 259 340 279 334 109 73 190
These values also give us the ability to create a choropleth map of Liverpool census regions for our K means clusters:
# Load spatial packages: rgdal for reading the OA boundaries, tmap for mapping.
# NOTE(review): rgdal was retired from CRAN in 2023 — consider migrating to
# sf::st_read() for new work.
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.5-8, (SVN revision 990)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.0.4, released 2020/01/28
## Path to GDAL shared files: C:/Users/Finnmcoolr/Documents/R/win-library/3.6/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: C:/Users/Finnmcoolr/Documents/R/win-library/3.6/rgdal/proj
## Linking to sp version:1.4-2
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(tmap)
# Import our OA boundary polygons (GeoJSON, read as an sp object).
# NOTE(review): absolute local path — on the repository this would be
# "./data/Liverpool_OA_2011.geojson".
liverpool_SP <- readOGR("C:/Users/Finnmcoolr/Documents/GIS 3/Class Assignments/urban_analytics-master/10_Data_Reduction_Geodemographics/data/Liverpool_OA_2011.geojson")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded datum OSGB_1936 in CRS definition: +proj=tmerc +lat_0=49
## +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +units=m +no_defs
## OGR data source with driver: GeoJSON
## Source: "C:\Users\Finnmcoolr\Documents\GIS 3\Class Assignments\urban_analytics-master\10_Data_Reduction_Geodemographics\data\Liverpool_OA_2011.geojson", layer: "Liverpool_OA_2011"
## with 1584 features
## It has 1 fields
# Join the cluster lookup onto the OA polygons by OA code
liverpool_SP <- merge(liverpool_SP, lookup, by.x = "oa_code", by.y = "OA")
# Build the thematic map in British National Grid (EPSG:27700).
# Fix: the title previously read "OAS Clusters" — the classification is the
# OAC (Output Area Classification), so the typo is corrected.
m <- tm_shape(liverpool_SP, projection = 27700) +
  tm_polygons(col = "SUPER", border.col = "black", palette = "Set1", border.alpha = .3, title = "Cluster", showNA = FALSE) +
  tm_layout(legend.position = c("left", "bottom"), frame = FALSE, title = "OAC Clusters of Liverpool", title.position = c("right", "top")) +
  tm_scale_bar() + tm_compass(north = 0, type = "4star")
# Render the map interactively with leaflet
tmap_leaflet(m)
## Compass not supported in view mode.
## Warning in sp::proj4string(obj): CRS object has comment, which is lost in output
## Warning: The shape liverpool_SP is invalid. See sf::st_is_valid
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
## legend.postion is used for plot mode. Use view.legend.position in tm_view to set the legend position in view mode.